Data Preparation

Import Dataset

Data Preprocessing

Defining Preprocess Recipe

# Define the preprocessing recipe from the train dataset.
# Pipeline: keep outcome + raw text, encode the outcome, balance classes,
# normalize the tweet text, then tokenize and build TF-IDF features.
rec <- recipe(sentiment ~ ., data = training(splitted)) %>% 
  # drop every column except the outcome and the raw text
  step_rm(-sentiment, -tweet) %>%
  # fix the outcome level order; skip = TRUE so bake() on new data
  # does not require re-applying this step
  step_string2factor(sentiment, levels = c("negative", "neutral", "positive"), skip = TRUE) %>%
  # balance classes to a 1:1 ratio (applied to training data only)
  step_downsample(sentiment, ratio = 1/1, seed = 100) %>%
  # --- text normalization (textclean/stringr helpers) ---
  step_mutate(tweet = str_squish(tweet)) %>% 
  step_mutate(tweet = replace_html(tweet, symbol = FALSE)) %>% 
  step_mutate(tweet = replace_kern(tweet)) %>% 
  step_mutate(tweet = replace_word_elongation(tweet)) %>% 
  # replace dates/times/money with placeholder tokens so they survive
  # tokenization as single features
  step_mutate(tweet = replace_date(tweet, replacement = "datewords")) %>% 
  step_mutate(tweet = replace_time(tweet, replacement = "timewords")) %>% 
  step_mutate(tweet = replace_money(tweet, replacement = "moneywords")) %>% 
  # spell out ordinals/numbers but keep the originals (remove = FALSE);
  # remaining digits are stripped below
  step_mutate(tweet = replace_ordinal(tweet, remove = FALSE)) %>% 
  step_mutate(tweet = replace_number(tweet, remove = FALSE)) %>% 
  step_mutate(tweet = replace_internet_slang(tweet)) %>% 
  step_mutate(tweet = replace_contraction(tweet)) %>% 
  step_mutate(tweet = replace_emoji(tweet)) %>% 
  step_mutate(tweet = replace_symbol(tweet)) %>% 
  step_mutate(tweet = str_squish(tweet)) %>% 
  # remove leftover markup tags and any remaining digits
  step_mutate(tweet = str_replace_all(tweet, "(<.*>)", "")) %>% 
  step_mutate(tweet = str_replace_all(tweet, "[:digit:]", "")) %>% 
  # --- tokenization and feature engineering ---
  step_tokenize(tweet, token = "words") %>%
  step_stem(tweet) %>%
  step_stopwords(tweet) %>%
  # keep the 256 most frequent tokens, weighted by TF-IDF
  step_tokenfilter(tweet, max_tokens = 256) %>%
  step_tfidf(tweet) %>%
  # NOTE: the correct argument is `strings_as_factors`; the previous
  # misspelling (`string_as_factor`) was silently absorbed by `...`
  # and had no effect.
  prep(strings_as_factors = FALSE)

# Get train and test datasets from the prepped recipe.
# `juice()` is superseded in recipes; `bake(rec, new_data = NULL)` is the
# recommended way to retrieve the retained (already-processed) training data.
data_train <- bake(rec, new_data = NULL)
data_test <- bake(rec, testing(splitted))

# quick check of the processed training features
head(data_train, 10)

Model Fitting

Defining Model Specifications

#> Boosted Tree Model Specification (classification)
#> 
#> Main Arguments:
#>   mtry = 32
#>   trees = 500
#>   min_n = 1
#>   tree_depth = 8
#>   learn_rate = 0.1
#>   loss_reduction = 0.1
#>   sample_size = 0.8
#> 
#> Engine-Specific Arguments:
#>   nthread = parallel::detectCores()/2
#> 
#> Computational engine: xgboost

Model Fitting

#> parsnip model object
#> 
#> ##### xgb.Booster
#> raw: 2.7 Mb 
#> call:
#>   xgboost::xgb.train(params = list(eta = 0.1, max_depth = 8, gamma = 0.1, 
#>     colsample_bytree = 0.125, min_child_weight = 1, subsample = 0.8), 
#>     data = x, nrounds = 500, verbose = 0, objective = "multi:softprob", 
#>     num_class = 3L, nthread = 6)
#> params (as set within xgb.train):
#>   eta = "0.1", max_depth = "8", gamma = "0.1", colsample_bytree = "0.125", min_child_weight = "1", subsample = "0.8", objective = "multi:softprob", num_class = "3", nthread = "6", silent = "1"
#> xgb.attributes:
#>   niter
#> # of features: 256 
#> niter: 500
#> nfeatures : 256

Model Evaluation

Predict on Test Dataset

Confusion Matrix

ROC Curve

Precision-Recall Curve